//
// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
//
// SPDX-License-Identifier: Apache-2.0
//

// Do not flag up inline assembly blocks
#pragma GCC diagnostic ignored "-Woverlength-strings"

#if !defined(__aarch64__) || !defined(__ARM_FEATURE_SVE2)
#error This file must be compiled for AArch64, FEAT_SVE2.
#else  // Architectural features check.

#include "kai_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot.h"

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

#include "kai/kai_common.h"

static const size_t kai_m_step = 1;
static const size_t kai_nr = 2;
static const size_t kai_n_step = 16;
static const size_t kai_kr = 2;
static const size_t kai_sr = 1;

size_t kai_get_m_step_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot(void) {
    return kai_m_step;
}

size_t kai_get_n_step_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot(void) {
    return kai_n_step * kai_get_sme_vector_length_u16() / kai_kr;
}

size_t kai_get_nr_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot(void) {
    return kai_nr * kai_get_sme_vector_length_u16() / kai_kr;
}

size_t kai_get_kr_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot(void) {
    return kai_kr;
}

size_t kai_get_sr_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot(void) {
    return kai_sr;
}

size_t kai_get_lhs_packed_offset_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot(size_t m_idx, size_t k) {
    KAI_ASSUME(m_idx % kai_get_m_step_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot() == 0);

    return m_idx * k;
}

static size_t kai_get_rhs_packed_stride_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot(size_t k) {
    return kai_get_n_step_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot() *
        ((kai_roundup(k, kai_kr) * sizeof(uint16_t) + sizeof(uint16_t)));
}

size_t kai_get_rhs_packed_offset_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot(size_t n_idx, size_t k) {
    KAI_ASSUME(n_idx % kai_get_n_step_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot() == 0);

    const size_t block_idx = n_idx / kai_get_n_step_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot();
    return block_idx * kai_get_rhs_packed_stride_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot(k);
}

size_t kai_get_dst_offset_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot(
    size_t m_idx, size_t n_idx, size_t dst_stride) {
    KAI_ASSUME(m_idx % kai_get_m_step_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot() == 0);
    KAI_ASSUME(n_idx % kai_get_n_step_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot() == 0);

    return (m_idx * dst_stride) + (n_idx * sizeof(uint16_t));
}

size_t kai_get_dst_size_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot(size_t m, size_t n) {
    return m * n * sizeof(uint16_t);
}

void kai_run_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot(
    size_t m, size_t n, size_t k, const void* lhs, size_t lhs_stride, const void* rhs_packed, void* dst,
    size_t dst_stride_row, size_t dst_stride_col, float clamp_min, float clamp_max) {
    KAI_UNUSED(dst_stride_row);
    KAI_UNUSED(dst_stride_col);
    KAI_UNUSED(lhs_stride);
    KAI_ASSERT(m == 1);

    typedef struct {
        float16_t maxval;
        float16_t minval;
    } KernelArgs;

    KernelArgs ka;
    ka.maxval = (float16_t)clamp_max;
    ka.minval = (float16_t)clamp_min;

    size_t N = n;
    size_t K = k;

    const void* A_ptr = lhs;
    const void* B_ptr = rhs_packed;
    void* output_ptr = dst;

    uint64_t flags = 0;

    __asm__ __volatile__(
        ".inst 0xd503477f  // SMSTART ZA\n"
        "mov x8, #0x0\n"
        "mov x16, %x[B_ptr]\n"
        "cntw x15, ALL, MUL #4\n"
        "mov x14, %x[output_ptr]\n"
        "add x13, %x[N], x15\n"
        "ptrue p1.b\n"
        "sub x13, x13, #0x1\n"
        ".inst 0x25207811  // ptrue pn9.b\n"
        "udiv x13, x13, x15\n"
        "mov x22, #0x1\n"
        "add x21, x13, #0x3\n"
        "and x21, x21, #0xfffffffffffffffc\n"
        "mul x21, x21, x15\n"
        "mul x21, x21, %x[K]\n"
        "lsl x21, x21, #0x1\n"
        "1:"  // RHS size check loop
        "cmp x21, #0x200000\n"
        "blt 2f\n"
        "tbnz x21, #0, 3f\n"
        "lsr x21, x21, #0x1\n"
        "lsl x22, x22, #0x1\n"
        "b 1b\n"
        "2:"  // RHS do prefetch
        "lsl x20, x21, #0x26\n"
        "sub x22, x22, #0x1\n"
        "lsl x22, x22, #0x16\n"
        "orr x21, x21, x20\n"
        "orr x21, x21, x22\n"
        ".inst 0xf8b54a1a  // rprfm pldonce, x21, [x16]\n"
        "3:"  // RHS prefetch exit
        "add x12, %x[K], #0x1\n"
        "cntw x20, ALL, MUL #2\n"
        "bic x12, x12, #0x1\n"
        "lsl x12, x12, #0x1\n"
        "add x12, x12, #0x2\n"
        "mul x12, x12, x20\n"
        "4:"  // Column loop
        "cmp x13, #0x4\n"
        "bge 22f\n"
        "cmp x13, #0x2\n"
        "bgt 16f\n"
        "beq 10f\n"
        "cntw x20, ALL, MUL #2\n"
        "add x22, x16, x12\n"
        "ld1h { z8.s }, p1/Z, [x16]\n"
        "cmp %x[N], x20\n"
        "ld1h { z9.s }, p1/Z, [x16, #1, MUL VL]\n"
        "mov x11, %x[K]\n"
        "csel x22, x22, x16, GT\n"
        "mov x21, %x[N]\n"
        "ld1h { z10.s }, p1/Z, [x22]\n"
        "fcvt z8.s, p1/m, z8.h\n"
        "mov x10, %x[A_ptr]\n"
        "lsl x20, %x[K], #0x1\n"
        "ld1h { z11.s }, p1/Z, [x22, #1, MUL VL]\n"
        "fcvt z9.s, p1/m, z9.h\n"
        ".inst 0x257547f0  // whilelt p8.h, XZR, x21, VLx2\n"
        "cmp x11, #0x8\n"
        ".inst 0xf8b44958  // rprfm pldmany, x20, [x10]\n"
        "inch x16, ALL, MUL #2\n"
        "fcvt z10.s, p1/m, z10.h\n"
        "inch x22, ALL, MUL #2\n"
        "fcvt z11.s, p1/m, z11.h\n"
        ".inst 0xc0040d00  // mova za.d[x8, #0], { z8.d-z11.d }\n"
        "ble 6f\n"
        "5:"  // Width 1: Multiply loop: Main loop head
        "whilelt p0.h, XZR, x11\n"
        ".inst 0xa0402615  // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        "ld1rqh { z4.h }, p0/Z, [x10]\n"
        "sub x11, x11, #0x8\n"
        "add x10, x10, #0x10\n"
        ".inst 0xa04026d7  // ldnt1h { z22.h-z23.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        "cmp x11, #0x8\n"
        ".inst 0xa0402609  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026cb  // ldnt1h { z10.h-z11.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc1549288  // fdot za.s[x8, 0], { z20.h-z23.h }, z4.h[0]\n"
        ".inst 0xa040260d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026cf  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0402615  // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026d7  // ldnt1h { z22.h-z23.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc1549508  // fdot za.s[x8, 0], { z8.h-z11.h }, z4.h[1]\n"
        ".inst 0xc1549988  // fdot za.s[x8, 0], { z12.h-z15.h }, z4.h[2]\n"
        ".inst 0xc1549e88  // fdot za.s[x8, 0], { z20.h-z23.h }, z4.h[3]\n"
        "bgt 5b\n"
        "6:"  // Width 1: Multiply loop: Single iteration only
        "whilelt p0.h, XZR, x11\n"
        ".inst 0xa040260d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x2\n"
        "ld1rqh { z3.h }, p0/Z, [x10]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026cf  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc1539188  // fdot za.s[x8, 0], { z12.h-z15.h }, z3.h[0]\n"
        "ble 7f\n"
        ".inst 0xa0402605  // ldnt1h { z4.h-z5.h }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x2\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026c7  // ldnt1h { z6.h-z7.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc1539488  // fdot za.s[x8, 0], { z4.h-z7.h }, z3.h[1]\n"
        "ble 7f\n"
        ".inst 0xa040260d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x2\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026cf  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc1539988  // fdot za.s[x8, 0], { z12.h-z15.h }, z3.h[2]\n"
        "ble 7f\n"
        ".inst 0xa0402611  // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x16]\n"
        ".inst 0xa04026d3  // ldnt1h { z18.h-z19.h }, pn9.b/Z, [x22]\n"
        ".inst 0xc1539e08  // fdot za.s[x8, 0], { z16.h-z19.h }, z3.h[3]\n"
        "7:"  // Width 1: Multiply loop: multiply skip
        "tbz %x[flags], #1, 8f\n"
        ".inst 0xc0060c04  // mova { z4.d-z7.d }, za.d[x8, #0]\n"
        "add x21, %x[args_ptr], %[offset_min]\n"
        "add x20, %x[args_ptr], %[offset_max]\n"
        "ld1rh { z19.h }, p1/Z, [x21]\n"
        "ld1rh { z22.h }, p1/Z, [x20]\n"
        ".inst 0xc120e094  // fcvt z20.h, { z4.s-z5.s }\n"
        ".inst 0xc120e0d5  // fcvt z21.h, { z6.s-z7.s }\n"
        ".inst 0xc176c274  // fclamp { z20.h-z21.h }, z19.h, z22.h\n"
        ".inst 0xa06021d4  // st1h { z20.h-z21.h }, p8, [x14]\n"
        "b 9f\n"
        "8:"  // Width 1: No activation
        ".inst 0xc0060c04  // mova { z4.d-z7.d }, za.d[x8, #0]\n"
        ".inst 0xc120e084  // fcvt z4.h, { z4.s-z5.s }\n"
        ".inst 0xc120e0c5  // fcvt z5.h, { z6.s-z7.s }\n"
        ".inst 0xa06021c4  // st1h { z4.h-z5.h }, p8, [x14]\n"
        "9:"  // Width 1: Output done
        "b 28f\n"
        "10:"  // Width 2
        "add x24, x16, x12, LSL #1\n"
        "cntw x20, ALL, MUL #6\n"
        "ld1h { z24.s }, p1/Z, [x16]\n"
        "add x23, x24, x12\n"
        "cmp %x[N], x20\n"
        "ld1h { z25.s }, p1/Z, [x16, #1, MUL VL]\n"
        "add x22, x16, x12\n"
        "csel x23, x23, x16, GT\n"
        "ld1h { z0.s }, p1/Z, [x24]\n"
        "ld1h { z26.s }, p1/Z, [x22]\n"
        "fcvt z24.s, p1/m, z24.h\n"
        "mov x11, %x[K]\n"
        "sub x21, %x[N], x15\n"
        "ld1h { z27.s }, p1/Z, [x22, #1, MUL VL]\n"
        "fcvt z25.s, p1/m, z25.h\n"
        "mov x10, %x[A_ptr]\n"
        "lsl x20, %x[K], #0x1\n"
        "ld1h { z1.s }, p1/Z, [x24, #1, MUL VL]\n"
        "fcvt z0.s, p1/m, z0.h\n"
        ".inst 0x257547f0  // whilelt p8.h, XZR, x21, VLx2\n"
        "cmp x11, #0x8\n"
        "ld1h { z2.s }, p1/Z, [x23]\n"
        "fcvt z26.s, p1/m, z26.h\n"
        ".inst 0xf8b44958  // rprfm pldmany, x20, [x10]\n"
        "inch x16, ALL, MUL #2\n"
        "ld1h { z3.s }, p1/Z, [x23, #1, MUL VL]\n"
        "fcvt z27.s, p1/m, z27.h\n"
        "inch x22, ALL, MUL #2\n"
        "inch x24, ALL, MUL #2\n"
        "fcvt z1.s, p1/m, z1.h\n"
        "inch x23, ALL, MUL #2\n"
        "fcvt z2.s, p1/m, z2.h\n"
        "fcvt z3.s, p1/m, z3.h\n"
        ".inst 0xc0040f00  // mova za.d[x8, #0], { z24.d-z27.d }\n"
        ".inst 0xc0040c01  // mova za.d[x8, #1], { z0.d-z3.d }\n"
        "ble 12f\n"
        "11:"  // Width 2: Multiply loop: Main loop head
        "whilelt p0.h, XZR, x11\n"
        ".inst 0xa0402615  // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        "ld1rqh { z4.h }, p0/Z, [x10]\n"
        "sub x11, x11, #0x8\n"
        "add x10, x10, #0x10\n"
        ".inst 0xa04026d7  // ldnt1h { z22.h-z23.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        "cmp x11, #0x8\n"
        ".inst 0xa0402709  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa04026eb  // ldnt1h { z10.h-z11.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1549288  // fdot za.s[x8, 0], { z20.h-z23.h }, z4.h[0]\n"
        ".inst 0xa0402615  // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026d7  // ldnt1h { z22.h-z23.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc1549109  // fdot za.s[x8, 1], { z8.h-z11.h }, z4.h[0]\n"
        ".inst 0xa040270d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa04026ef  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1549688  // fdot za.s[x8, 0], { z20.h-z23.h }, z4.h[1]\n"
        ".inst 0xa0402611  // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026d3  // ldnt1h { z18.h-z19.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc1549589  // fdot za.s[x8, 1], { z12.h-z15.h }, z4.h[1]\n"
        ".inst 0xa0402719  // ldnt1h { z24.h-z25.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa04026fb  // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1549a08  // fdot za.s[x8, 0], { z16.h-z19.h }, z4.h[2]\n"
        ".inst 0xa0402615  // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026d7  // ldnt1h { z22.h-z23.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xc1549b09  // fdot za.s[x8, 1], { z24.h-z27.h }, z4.h[2]\n"
        ".inst 0xa0402709  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa04026eb  // ldnt1h { z10.h-z11.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1549e88  // fdot za.s[x8, 0], { z20.h-z23.h }, z4.h[3]\n"
        ".inst 0xc1549d09  // fdot za.s[x8, 1], { z8.h-z11.h }, z4.h[3]\n"
        "bgt 11b\n"
        "12:"  // Width 2: Multiply loop: Single iteration only
        "whilelt p0.h, XZR, x11\n"
        ".inst 0xa0402615  // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x2\n"
        "ld1rqh { z3.h }, p0/Z, [x10]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026d7  // ldnt1h { z22.h-z23.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa040270d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa04026ef  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1539288  // fdot za.s[x8, 0], { z20.h-z23.h }, z3.h[0]\n"
        ".inst 0xc1539189  // fdot za.s[x8, 1], { z12.h-z15.h }, z3.h[0]\n"
        "ble 13f\n"
        ".inst 0xa0402611  // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x2\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026d3  // ldnt1h { z18.h-z19.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0402709  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa04026eb  // ldnt1h { z10.h-z11.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1539608  // fdot za.s[x8, 0], { z16.h-z19.h }, z3.h[1]\n"
        ".inst 0xc1539509  // fdot za.s[x8, 1], { z8.h-z11.h }, z3.h[1]\n"
        "ble 13f\n"
        ".inst 0xa0402619  // ldnt1h { z24.h-z25.h }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x2\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026db  // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0402711  // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa04026f3  // ldnt1h { z18.h-z19.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1539b08  // fdot za.s[x8, 0], { z24.h-z27.h }, z3.h[2]\n"
        ".inst 0xc1539a09  // fdot za.s[x8, 1], { z16.h-z19.h }, z3.h[2]\n"
        "ble 13f\n"
        ".inst 0xa0402609  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x16]\n"
        ".inst 0xa04026cb  // ldnt1h { z10.h-z11.h }, pn9.b/Z, [x22]\n"
        ".inst 0xa0402705  // ldnt1h { z4.h-z5.h }, pn9.b/Z, [x24]\n"
        ".inst 0xa04026e7  // ldnt1h { z6.h-z7.h }, pn9.b/Z, [x23]\n"
        ".inst 0xc1539d08  // fdot za.s[x8, 0], { z8.h-z11.h }, z3.h[3]\n"
        ".inst 0xc1539c89  // fdot za.s[x8, 1], { z4.h-z7.h }, z3.h[3]\n"
        "13:"  // Width 2: Multiply loop: multiply skip
        "tbz %x[flags], #1, 14f\n"
        ".inst 0xc0060c08  // mova { z8.d-z11.d }, za.d[x8, #0]\n"
        "add x21, %x[args_ptr], %[offset_min]\n"
        "add x20, %x[args_ptr], %[offset_max]\n"
        ".inst 0xc0060c2c  // mova { z12.d-z15.d }, za.d[x8, #1]\n"
        "ld1rh { z6.h }, p1/Z, [x21]\n"
        "ld1rh { z22.h }, p1/Z, [x20]\n"
        ".inst 0xc120e112  // fcvt z18.h, { z8.s-z9.s }\n"
        ".inst 0xc120e153  // fcvt z19.h, { z10.s-z11.s }\n"
        ".inst 0xc120e190  // fcvt z16.h, { z12.s-z13.s }\n"
        ".inst 0xc120e1d1  // fcvt z17.h, { z14.s-z15.s }\n"
        ".inst 0xc176c0d2  // fclamp { z18.h-z19.h }, z6.h, z22.h\n"
        ".inst 0xc176c0d0  // fclamp { z16.h-z17.h }, z6.h, z22.h\n"
        ".inst 0xa06025d2  // st1h { z18.h-z19.h }, pn9.b, [x14]\n"
        ".inst 0xa06121d0  // st1h { z16.h-z17.h }, p8, [x14, #0x2, MUL VL]\n"
        "b 15f\n"
        "14:"  // Width 2: No activation
        ".inst 0xc0060c1c  // mova { z28.d-z31.d }, za.d[x8, #0]\n"
        ".inst 0xc0060c34  // mova { z20.d-z23.d }, za.d[x8, #1]\n"
        ".inst 0xc120e39a  // fcvt z26.h, { z28.s-z29.s }\n"
        ".inst 0xc120e3db  // fcvt z27.h, { z30.s-z31.s }\n"
        ".inst 0xa06025da  // st1h { z26.h-z27.h }, pn9.b, [x14]\n"
        ".inst 0xc120e291  // fcvt z17.h, { z20.s-z21.s }\n"
        ".inst 0xc120e2d9  // fcvt z25.h, { z22.s-z23.s }\n"
        ".inst 0xa16121d1  // st1h { z17.h, z25.h }, p8, [x14, #0x2, MUL VL]\n"
        "15:"  // Width 2: Output done
        "b 28f\n"
        "16:"  // Width 3
        "add x26, x16, x12, LSL #2\n"
        "cntw x20, ALL, MUL #10\n"
        "ld1h { z28.s }, p1/Z, [x16]\n"
        "add x25, x16, x12, LSL #1\n"
        "add x24, x26, x12\n"
        "ld1h { z29.s }, p1/Z, [x16, #1, MUL VL]\n"
        "cmp %x[N], x20\n"
        "add x23, x16, x12\n"
        "ld1h { z4.s }, p1/Z, [x25]\n"
        "add x22, x25, x12\n"
        "csel x24, x24, x16, GT\n"
        "ld1h { z30.s }, p1/Z, [x23]\n"
        "fcvt z28.s, p1/m, z28.h\n"
        "ld1h { z31.s }, p1/Z, [x23, #1, MUL VL]\n"
        "fcvt z29.s, p1/m, z29.h\n"
        "mov x20, #0x2\n"
        "mov x11, %x[K]\n"
        "ld1h { z5.s }, p1/Z, [x25, #1, MUL VL]\n"
        "fcvt z4.s, p1/m, z4.h\n"
        "msub x21, x15, x20, %x[N]\n"
        "mov x10, %x[A_ptr]\n"
        "ld1h { z6.s }, p1/Z, [x22]\n"
        "fcvt z30.s, p1/m, z30.h\n"
        "lsl x20, %x[K], #0x1\n"
        ".inst 0x257547f0  // whilelt p8.h, XZR, x21, VLx2\n"
        "ld1h { z7.s }, p1/Z, [x22, #1, MUL VL]\n"
        "fcvt z31.s, p1/m, z31.h\n"
        "cmp x11, #0x8\n"
        ".inst 0xf8b44958  // rprfm pldmany, x20, [x10]\n"
        "ld1h { z8.s }, p1/Z, [x26]\n"
        "fcvt z5.s, p1/m, z5.h\n"
        "inch x16, ALL, MUL #2\n"
        "inch x23, ALL, MUL #2\n"
        "ld1h { z9.s }, p1/Z, [x26, #1, MUL VL]\n"
        "fcvt z6.s, p1/m, z6.h\n"
        "inch x25, ALL, MUL #2\n"
        "inch x22, ALL, MUL #2\n"
        "ld1h { z10.s }, p1/Z, [x24]\n"
        "fcvt z7.s, p1/m, z7.h\n"
        "inch x26, ALL, MUL #2\n"
        "ld1h { z11.s }, p1/Z, [x24, #1, MUL VL]\n"
        "fcvt z8.s, p1/m, z8.h\n"
        "inch x24, ALL, MUL #2\n"
        ".inst 0xc0040f80  // mova za.d[x8, #0], { z28.d-z31.d }\n"
        "fcvt z9.s, p1/m, z9.h\n"
        "fcvt z10.s, p1/m, z10.h\n"
        "fcvt z11.s, p1/m, z11.h\n"
        ".inst 0xc0040c81  // mova za.d[x8, #1], { z4.d-z7.d }\n"
        ".inst 0xc0040d02  // mova za.d[x8, #2], { z8.d-z11.d }\n"
        "ble 18f\n"
        "17:"  // Width 3: Multiply loop: Main loop head
        "whilelt p0.h, XZR, x11\n"
        ".inst 0xa040260d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        "ld1rqh { z4.h }, p0/Z, [x10]\n"
        "sub x11, x11, #0x8\n"
        "add x10, x10, #0x10\n"
        ".inst 0xa04026ef  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        "cmp x11, #0x8\n"
        ".inst 0xa0402731  // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa04026d3  // ldnt1h { z18.h-z19.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0402741  // ldnt1h { z0.h-z1.h }, pn9.b/Z, [x26]\n"
        ".inst 0xc1549188  // fdot za.s[x8, 0], { z12.h-z15.h }, z4.h[0]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa0402703  // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xc1549209  // fdot za.s[x8, 1], { z16.h-z19.h }, z4.h[0]\n"
        ".inst 0xa0402611  // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026f3  // ldnt1h { z18.h-z19.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc154900a  // fdot za.s[x8, 2], { z0.h-z3.h }, z4.h[0]\n"
        ".inst 0xa040272d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa04026cf  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0402755  // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x26]\n"
        ".inst 0xc1549608  // fdot za.s[x8, 0], { z16.h-z19.h }, z4.h[1]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa0402717  // ldnt1h { z22.h-z23.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xc1549589  // fdot za.s[x8, 1], { z12.h-z15.h }, z4.h[1]\n"
        ".inst 0xa040260d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026ef  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc154968a  // fdot za.s[x8, 2], { z20.h-z23.h }, z4.h[1]\n"
        ".inst 0xa0402729  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa04026cb  // ldnt1h { z10.h-z11.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0402751  // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x26]\n"
        ".inst 0xc1549988  // fdot za.s[x8, 0], { z12.h-z15.h }, z4.h[2]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa0402713  // ldnt1h { z18.h-z19.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xc1549909  // fdot za.s[x8, 1], { z8.h-z11.h }, z4.h[2]\n"
        ".inst 0xa0402619  // ldnt1h { z24.h-z25.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026fb  // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xc1549a0a  // fdot za.s[x8, 2], { z16.h-z19.h }, z4.h[2]\n"
        ".inst 0xa0402731  // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa04026d3  // ldnt1h { z18.h-z19.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa040274d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x26]\n"
        ".inst 0xc1549f08  // fdot za.s[x8, 0], { z24.h-z27.h }, z4.h[3]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa040270f  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xc1549e09  // fdot za.s[x8, 1], { z16.h-z19.h }, z4.h[3]\n"
        ".inst 0xc1549d8a  // fdot za.s[x8, 2], { z12.h-z15.h }, z4.h[3]\n"
        "bgt 17b\n"
        "18:"  // Width 3: Multiply loop: Single iteration only
        "whilelt p0.h, XZR, x11\n"
        ".inst 0xa0402605  // ldnt1h { z4.h-z5.h }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x2\n"
        "ld1rqh { z3.h }, p0/Z, [x10]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026e7  // ldnt1h { z6.h-z7.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa040272d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa04026cf  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0402759  // ldnt1h { z24.h-z25.h }, pn9.b/Z, [x26]\n"
        ".inst 0xc1539088  // fdot za.s[x8, 0], { z4.h-z7.h }, z3.h[0]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa040271b  // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xc1539189  // fdot za.s[x8, 1], { z12.h-z15.h }, z3.h[0]\n"
        ".inst 0xc153930a  // fdot za.s[x8, 2], { z24.h-z27.h }, z3.h[0]\n"
        "ble 19f\n"
        ".inst 0xa0402619  // ldnt1h { z24.h-z25.h }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x2\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026fb  // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa0402729  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa04026cb  // ldnt1h { z10.h-z11.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0402751  // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x26]\n"
        ".inst 0xc1539708  // fdot za.s[x8, 0], { z24.h-z27.h }, z3.h[1]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa0402713  // ldnt1h { z18.h-z19.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xc1539509  // fdot za.s[x8, 1], { z8.h-z11.h }, z3.h[1]\n"
        ".inst 0xc153960a  // fdot za.s[x8, 2], { z16.h-z19.h }, z3.h[1]\n"
        "ble 19f\n"
        ".inst 0xa040260d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x2\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa04026ef  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa0402729  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa04026cb  // ldnt1h { z10.h-z11.h }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa0402745  // ldnt1h { z4.h-z5.h }, pn9.b/Z, [x26]\n"
        ".inst 0xc1539988  // fdot za.s[x8, 0], { z12.h-z15.h }, z3.h[2]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa0402707  // ldnt1h { z6.h-z7.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xc1539909  // fdot za.s[x8, 1], { z8.h-z11.h }, z3.h[2]\n"
        ".inst 0xc153988a  // fdot za.s[x8, 2], { z4.h-z7.h }, z3.h[2]\n"
        "ble 19f\n"
        ".inst 0xa0402619  // ldnt1h { z24.h-z25.h }, pn9.b/Z, [x16]\n"
        ".inst 0xa04026fb  // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x23]\n"
        ".inst 0xa040273d  // ldnt1h { z28.h-z29.h }, pn9.b/Z, [x25]\n"
        ".inst 0xa04026df  // ldnt1h { z30.h-z31.h }, pn9.b/Z, [x22]\n"
        ".inst 0xa040274d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x26]\n"
        ".inst 0xc1539f08  // fdot za.s[x8, 0], { z24.h-z27.h }, z3.h[3]\n"
        ".inst 0xa040270f  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x24]\n"
        ".inst 0xc1539f89  // fdot za.s[x8, 1], { z28.h-z31.h }, z3.h[3]\n"
        ".inst 0xc1539d8a  // fdot za.s[x8, 2], { z12.h-z15.h }, z3.h[3]\n"
        "19:"  // Width 3: Multiply loop: multiply skip
        "tbz %x[flags], #1, 20f\n"
        ".inst 0xc0060c18  // mova { z24.d-z27.d }, za.d[x8, #0]\n"
        "add x21, %x[args_ptr], %[offset_min]\n"
        "add x20, %x[args_ptr], %[offset_max]\n"
        ".inst 0xc0060c3c  // mova { z28.d-z31.d }, za.d[x8, #1]\n"
        "ld1rh { z19.h }, p1/Z, [x21]\n"
        ".inst 0xc0060c40  // mova { z0.d-z3.d }, za.d[x8, #2]\n"
        "ld1rh { z18.h }, p1/Z, [x20]\n"
        ".inst 0xc120e314  // fcvt z20.h, { z24.s-z25.s }\n"
        ".inst 0xc120e355  // fcvt z21.h, { z26.s-z27.s }\n"
        ".inst 0xc120e38e  // fcvt z14.h, { z28.s-z29.s }\n"
        ".inst 0xc120e3cf  // fcvt z15.h, { z30.s-z31.s }\n"
        ".inst 0xc172c274  // fclamp { z20.h-z21.h }, z19.h, z18.h\n"
        ".inst 0xc120e010  // fcvt z16.h, { z0.s-z1.s }\n"
        ".inst 0xc120e051  // fcvt z17.h, { z2.s-z3.s }\n"
        ".inst 0xc172c26e  // fclamp { z14.h-z15.h }, z19.h, z18.h\n"
        ".inst 0xc172c270  // fclamp { z16.h-z17.h }, z19.h, z18.h\n"
        ".inst 0xa06025d4  // st1h { z20.h-z21.h }, pn9.b, [x14]\n"
        ".inst 0xa06125ce  // st1h { z14.h-z15.h }, pn9.b, [x14, #0x2, MUL VL]\n"
        ".inst 0xa06221d0  // st1h { z16.h-z17.h }, p8, [x14, #0x4, MUL VL]\n"
        "b 21f\n"
        "20:"  // Width 3: No activation
        ".inst 0xc0060c04  // mova { z4.d-z7.d }, za.d[x8, #0]\n"
        ".inst 0xc0060c28  // mova { z8.d-z11.d }, za.d[x8, #1]\n"
        ".inst 0xc0060c4c  // mova { z12.d-z15.d }, za.d[x8, #2]\n"
        ".inst 0xc120e091  // fcvt z17.h, { z4.s-z5.s }\n"
        ".inst 0xc120e0d9  // fcvt z25.h, { z6.s-z7.s }\n"
        ".inst 0xa16025d1  // st1h { z17.h, z25.h }, pn9.b, [x14]\n"
        ".inst 0xc120e112  // fcvt z18.h, { z8.s-z9.s }\n"
        ".inst 0xc120e153  // fcvt z19.h, { z10.s-z11.s }\n"
        ".inst 0xa06125d2  // st1h { z18.h-z19.h }, pn9.b, [x14, #0x2, MUL VL]\n"
        ".inst 0xc120e191  // fcvt z17.h, { z12.s-z13.s }\n"
        ".inst 0xc120e1d9  // fcvt z25.h, { z14.s-z15.s }\n"
        ".inst 0xa16221d1  // st1h { z17.h, z25.h }, p8, [x14, #0x4, MUL VL]\n"
        "21:"  // Width 3: Output done
        "b 28f\n"
        "22:"  // Width 4
        "add x9, x16, x12, LSL #2\n"
        "cntw x20, ALL, MUL #14\n"
        "ld1h { z12.s }, p1/Z, [x16]\n"
        "add x28, x9, x12, LSL #1\n"
        "add x27, x16, x12, LSL #1\n"
        "ld1h { z13.s }, p1/Z, [x16, #1, MUL VL]\n"
        "add x26, x28, x12\n"
        "cmp %x[N], x20\n"
        "ld1h { z8.s }, p1/Z, [x27]\n"
        "add x25, x16, x12\n"
        "add x24, x27, x12\n"
        "ld1h { z9.s }, p1/Z, [x27, #1, MUL VL]\n"
        "fcvt z12.s, p1/m, z12.h\n"
        "add x23, x9, x12\n"
        "csel x26, x26, x16, GT\n"
        "ld1h { z14.s }, p1/Z, [x25]\n"
        "fcvt z13.s, p1/m, z13.h\n"
        "ld1h { z15.s }, p1/Z, [x25, #1, MUL VL]\n"
        "fcvt z8.s, p1/m, z8.h\n"
        "mov x20, #0x3\n"
        "mov x11, %x[K]\n"
        "ld1h { z10.s }, p1/Z, [x24]\n"
        "fcvt z9.s, p1/m, z9.h\n"
        "msub x21, x15, x20, %x[N]\n"
        "mov x10, %x[A_ptr]\n"
        "ld1h { z11.s }, p1/Z, [x24, #1, MUL VL]\n"
        "fcvt z14.s, p1/m, z14.h\n"
        "lsl x20, %x[K], #0x1\n"
        ".inst 0x257547f0  // whilelt p8.h, XZR, x21, VLx2\n"
        "ld1h { z4.s }, p1/Z, [x9]\n"
        "fcvt z15.s, p1/m, z15.h\n"
        "cmp x11, #0x8\n"
        ".inst 0xf8b44958  // rprfm pldmany, x20, [x10]\n"
        "ld1h { z5.s }, p1/Z, [x9, #1, MUL VL]\n"
        "fcvt z10.s, p1/m, z10.h\n"
        "add x22, x16, x12, LSL #3\n"
        "inch x16, ALL, MUL #2\n"
        "ld1h { z6.s }, p1/Z, [x23]\n"
        "fcvt z11.s, p1/m, z11.h\n"
        "inch x25, ALL, MUL #2\n"
        "inch x27, ALL, MUL #2\n"
        "ld1h { z7.s }, p1/Z, [x23, #1, MUL VL]\n"
        "fcvt z4.s, p1/m, z4.h\n"
        "inch x24, ALL, MUL #2\n"
        "inch x9, ALL, MUL #2\n"
        "ld1h { z0.s }, p1/Z, [x28]\n"
        "fcvt z5.s, p1/m, z5.h\n"
        "inch x23, ALL, MUL #2\n"
        ".inst 0xc0040d80  // mova za.d[x8, #0], { z12.d-z15.d }\n"
        "ld1h { z1.s }, p1/Z, [x28, #1, MUL VL]\n"
        "fcvt z6.s, p1/m, z6.h\n"
        "inch x28, ALL, MUL #2\n"
        "ld1h { z2.s }, p1/Z, [x26]\n"
        "fcvt z7.s, p1/m, z7.h\n"
        ".inst 0xc0040d01  // mova za.d[x8, #1], { z8.d-z11.d }\n"
        "ld1h { z3.s }, p1/Z, [x26, #1, MUL VL]\n"
        "fcvt z0.s, p1/m, z0.h\n"
        "inch x26, ALL, MUL #2\n"
        "fcvt z1.s, p1/m, z1.h\n"
        "fcvt z2.s, p1/m, z2.h\n"
        "fcvt z3.s, p1/m, z3.h\n"
        ".inst 0xc0040c82  // mova za.d[x8, #2], { z4.d-z7.d }\n"
        ".inst 0xc0040c03  // mova za.d[x8, #3], { z0.d-z3.d }\n"
        "ble 24f\n"
        "23:"  // Width 4: Multiply loop: Main loop head
        "whilelt p0.h, XZR, x11\n"
        ".inst 0xa040260d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        "ld1rqh { z0.h }, p0/Z, [x10]\n"
        "sub x11, x11, #0x8\n"
        "add x10, x10, #0x10\n"
        ".inst 0xa040272f  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        "cmp x11, #0x8\n"
        ".inst 0xa0402765  // ldnt1h { z4.h-z5.h }, pn9.b/Z, [x27]\n"
        "addvl x27, x27, #2\n"
        ".inst 0xa0402707  // ldnt1h { z6.h-z7.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa0402529  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x9]\n"
        ".inst 0xc1509188  // fdot za.s[x8, 0], { z12.h-z15.h }, z0.h[0]\n"
        "addvl x9, x9, #2\n"
        ".inst 0xa04026eb  // ldnt1h { z10.h-z11.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa040278d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x28]\n"
        ".inst 0xc1509089  // fdot za.s[x8, 1], { z4.h-z7.h }, z0.h[0]\n"
        "addvl x28, x28, #2\n"
        ".inst 0xa040274f  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x26]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xc150910a  // fdot za.s[x8, 2], { z8.h-z11.h }, z0.h[0]\n"
        ".inst 0xa0402609  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa040272b  // ldnt1h { z10.h-z11.h }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xc150918b  // fdot za.s[x8, 3], { z12.h-z15.h }, z0.h[0]\n"
        ".inst 0xa0402765  // ldnt1h { z4.h-z5.h }, pn9.b/Z, [x27]\n"
        "addvl x27, x27, #2\n"
        ".inst 0xa0402707  // ldnt1h { z6.h-z7.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa040252d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x9]\n"
        ".inst 0xc1509508  // fdot za.s[x8, 0], { z8.h-z11.h }, z0.h[1]\n"
        "addvl x9, x9, #2\n"
        ".inst 0xa04026ef  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa0402789  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x28]\n"
        ".inst 0xc1509489  // fdot za.s[x8, 1], { z4.h-z7.h }, z0.h[1]\n"
        "addvl x28, x28, #2\n"
        ".inst 0xa040274b  // ldnt1h { z10.h-z11.h }, pn9.b/Z, [x26]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xc150958a  // fdot za.s[x8, 2], { z12.h-z15.h }, z0.h[1]\n"
        ".inst 0xa040260d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa040272f  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xc150950b  // fdot za.s[x8, 3], { z8.h-z11.h }, z0.h[1]\n"
        ".inst 0xa0402765  // ldnt1h { z4.h-z5.h }, pn9.b/Z, [x27]\n"
        "addvl x27, x27, #2\n"
        ".inst 0xa0402707  // ldnt1h { z6.h-z7.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa0402529  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x9]\n"
        ".inst 0xc1509988  // fdot za.s[x8, 0], { z12.h-z15.h }, z0.h[2]\n"
        "addvl x9, x9, #2\n"
        ".inst 0xa04026eb  // ldnt1h { z10.h-z11.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa040278d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x28]\n"
        ".inst 0xc1509889  // fdot za.s[x8, 1], { z4.h-z7.h }, z0.h[2]\n"
        "addvl x28, x28, #2\n"
        ".inst 0xa040274f  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x26]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xc150990a  // fdot za.s[x8, 2], { z8.h-z11.h }, z0.h[2]\n"
        ".inst 0xa040261d  // ldnt1h { z28.h-z29.h }, pn9.b/Z, [x16]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa040273f  // ldnt1h { z30.h-z31.h }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xc150998b  // fdot za.s[x8, 3], { z12.h-z15.h }, z0.h[2]\n"
        ".inst 0xa0402769  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x27]\n"
        "addvl x27, x27, #2\n"
        ".inst 0xa040270b  // ldnt1h { z10.h-z11.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa0402535  // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x9]\n"
        ".inst 0xc1509f88  // fdot za.s[x8, 0], { z28.h-z31.h }, z0.h[3]\n"
        "addvl x9, x9, #2\n"
        ".inst 0xa04026f7  // ldnt1h { z22.h-z23.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa0402791  // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x28]\n"
        ".inst 0xc1509d09  // fdot za.s[x8, 1], { z8.h-z11.h }, z0.h[3]\n"
        "addvl x28, x28, #2\n"
        ".inst 0xa0402753  // ldnt1h { z18.h-z19.h }, pn9.b/Z, [x26]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xc1509e8a  // fdot za.s[x8, 2], { z20.h-z23.h }, z0.h[3]\n"
        ".inst 0xc1509e0b  // fdot za.s[x8, 3], { z16.h-z19.h }, z0.h[3]\n"
        "bgt 23b\n"
        "24:"  // Width 4: Multiply loop: Single iteration only
        "whilelt p0.h, XZR, x11\n"
        ".inst 0xa0402615  // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x2\n"
        "ld1rqh { z3.h }, p0/Z, [x10]\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa0402737  // ldnt1h { z22.h-z23.h }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa040276d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x27]\n"
        "addvl x27, x27, #2\n"
        ".inst 0xa040270f  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa0402531  // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x9]\n"
        ".inst 0xc1539288  // fdot za.s[x8, 0], { z20.h-z23.h }, z3.h[0]\n"
        "addvl x9, x9, #2\n"
        ".inst 0xa04026f3  // ldnt1h { z18.h-z19.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa040279d  // ldnt1h { z28.h-z29.h }, pn9.b/Z, [x28]\n"
        ".inst 0xc1539189  // fdot za.s[x8, 1], { z12.h-z15.h }, z3.h[0]\n"
        "addvl x28, x28, #2\n"
        ".inst 0xa040275f  // ldnt1h { z30.h-z31.h }, pn9.b/Z, [x26]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xc153920a  // fdot za.s[x8, 2], { z16.h-z19.h }, z3.h[0]\n"
        ".inst 0xc153938b  // fdot za.s[x8, 3], { z28.h-z31.h }, z3.h[0]\n"
        "ble 25f\n"
        ".inst 0xa0402609  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x2\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa040272b  // ldnt1h { z10.h-z11.h }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa0402765  // ldnt1h { z4.h-z5.h }, pn9.b/Z, [x27]\n"
        "addvl x27, x27, #2\n"
        ".inst 0xa0402707  // ldnt1h { z6.h-z7.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa040252d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x9]\n"
        ".inst 0xc1539508  // fdot za.s[x8, 0], { z8.h-z11.h }, z3.h[1]\n"
        "addvl x9, x9, #2\n"
        ".inst 0xa04026ef  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa0402795  // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x28]\n"
        ".inst 0xc1539489  // fdot za.s[x8, 1], { z4.h-z7.h }, z3.h[1]\n"
        "addvl x28, x28, #2\n"
        ".inst 0xa0402757  // ldnt1h { z22.h-z23.h }, pn9.b/Z, [x26]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xc153958a  // fdot za.s[x8, 2], { z12.h-z15.h }, z3.h[1]\n"
        ".inst 0xc153968b  // fdot za.s[x8, 3], { z20.h-z23.h }, z3.h[1]\n"
        "ble 25f\n"
        ".inst 0xa040260d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x16]\n"
        "subs x11, x11, #0x2\n"
        "addvl x16, x16, #2\n"
        ".inst 0xa040272f  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x25]\n"
        "addvl x25, x25, #2\n"
        ".inst 0xa0402769  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x27]\n"
        "addvl x27, x27, #2\n"
        ".inst 0xa040270b  // ldnt1h { z10.h-z11.h }, pn9.b/Z, [x24]\n"
        "addvl x24, x24, #2\n"
        ".inst 0xa0402535  // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x9]\n"
        ".inst 0xc1539988  // fdot za.s[x8, 0], { z12.h-z15.h }, z3.h[2]\n"
        "addvl x9, x9, #2\n"
        ".inst 0xa04026f7  // ldnt1h { z22.h-z23.h }, pn9.b/Z, [x23]\n"
        "addvl x23, x23, #2\n"
        ".inst 0xa0402791  // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x28]\n"
        ".inst 0xc1539909  // fdot za.s[x8, 1], { z8.h-z11.h }, z3.h[2]\n"
        "addvl x28, x28, #2\n"
        ".inst 0xa0402753  // ldnt1h { z18.h-z19.h }, pn9.b/Z, [x26]\n"
        "addvl x26, x26, #2\n"
        ".inst 0xc1539a8a  // fdot za.s[x8, 2], { z20.h-z23.h }, z3.h[2]\n"
        ".inst 0xc1539a0b  // fdot za.s[x8, 3], { z16.h-z19.h }, z3.h[2]\n"
        "ble 25f\n"
        ".inst 0xa0402605  // ldnt1h { z4.h-z5.h }, pn9.b/Z, [x16]\n"
        ".inst 0xa0402727  // ldnt1h { z6.h-z7.h }, pn9.b/Z, [x25]\n"
        ".inst 0xa040276d  // ldnt1h { z12.h-z13.h }, pn9.b/Z, [x27]\n"
        ".inst 0xa040270f  // ldnt1h { z14.h-z15.h }, pn9.b/Z, [x24]\n"
        ".inst 0xa0402531  // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x9]\n"
        ".inst 0xc1539c88  // fdot za.s[x8, 0], { z4.h-z7.h }, z3.h[3]\n"
        ".inst 0xa04026f3  // ldnt1h { z18.h-z19.h }, pn9.b/Z, [x23]\n"
        ".inst 0xa0402795  // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x28]\n"
        ".inst 0xc1539d89  // fdot za.s[x8, 1], { z12.h-z15.h }, z3.h[3]\n"
        ".inst 0xa0402757  // ldnt1h { z22.h-z23.h }, pn9.b/Z, [x26]\n"
        ".inst 0xc1539e0a  // fdot za.s[x8, 2], { z16.h-z19.h }, z3.h[3]\n"
        ".inst 0xc1539e8b  // fdot za.s[x8, 3], { z20.h-z23.h }, z3.h[3]\n"
        "25:"  // Width 4: Multiply loop: multiply skip
        "tbz %x[flags], #1, 26f\n"
        ".inst 0xc0060c1c  // mova { z28.d-z31.d }, za.d[x8, #0]\n"
        "add x21, %x[args_ptr], %[offset_min]\n"
        "add x20, %x[args_ptr], %[offset_max]\n"
        ".inst 0xc0060c2c  // mova { z12.d-z15.d }, za.d[x8, #1]\n"
        "ld1rh { z19.h }, p1/Z, [x21]\n"
        ".inst 0xc0060c40  // mova { z0.d-z3.d }, za.d[x8, #2]\n"
        "ld1rh { z18.h }, p1/Z, [x20]\n"
        ".inst 0xc0060c64  // mova { z4.d-z7.d }, za.d[x8, #3]\n"
        ".inst 0xc120e38a  // fcvt z10.h, { z28.s-z29.s }\n"
        ".inst 0xc120e3cb  // fcvt z11.h, { z30.s-z31.s }\n"
        ".inst 0xc120e18c  // fcvt z12.h, { z12.s-z13.s }\n"
        ".inst 0xc120e1cd  // fcvt z13.h, { z14.s-z15.s }\n"
        ".inst 0xc172c26a  // fclamp { z10.h-z11.h }, z19.h, z18.h\n"
        ".inst 0xc120e00e  // fcvt z14.h, { z0.s-z1.s }\n"
        ".inst 0xc120e04f  // fcvt z15.h, { z2.s-z3.s }\n"
        ".inst 0xc172c26c  // fclamp { z12.h-z13.h }, z19.h, z18.h\n"
        ".inst 0xc120e090  // fcvt z16.h, { z4.s-z5.s }\n"
        ".inst 0xc120e0d1  // fcvt z17.h, { z6.s-z7.s }\n"
        ".inst 0xc172c26e  // fclamp { z14.h-z15.h }, z19.h, z18.h\n"
        ".inst 0xc172c270  // fclamp { z16.h-z17.h }, z19.h, z18.h\n"
        ".inst 0xa06025ca  // st1h { z10.h-z11.h }, pn9.b, [x14]\n"
        ".inst 0xa06125cc  // st1h { z12.h-z13.h }, pn9.b, [x14, #0x2, MUL VL]\n"
        ".inst 0xa06225ce  // st1h { z14.h-z15.h }, pn9.b, [x14, #0x4, MUL VL]\n"
        ".inst 0xa06321d0  // st1h { z16.h-z17.h }, p8, [x14, #0x6, MUL VL]\n"
        "addvl x14, x14, #8\n"
        "b 27f\n"
        "26:"  // Width 4: No activation
        ".inst 0xc0060c0c  // mova { z12.d-z15.d }, za.d[x8, #0]\n"
        ".inst 0xc0060c30  // mova { z16.d-z19.d }, za.d[x8, #1]\n"
        ".inst 0xc0060c5c  // mova { z28.d-z31.d }, za.d[x8, #2]\n"
        ".inst 0xc0060c68  // mova { z8.d-z11.d }, za.d[x8, #3]\n"
        ".inst 0xc120e187  // fcvt z7.h, { z12.s-z13.s }\n"
        ".inst 0xc120e1cf  // fcvt z15.h, { z14.s-z15.s }\n"
        ".inst 0xa16025c7  // st1h { z7.h, z15.h }, pn9.b, [x14]\n"
        ".inst 0xc120e207  // fcvt z7.h, { z16.s-z17.s }\n"
        ".inst 0xc120e24f  // fcvt z15.h, { z18.s-z19.s }\n"
        ".inst 0xa16125c7  // st1h { z7.h, z15.h }, pn9.b, [x14, #0x2, MUL VL]\n"
        ".inst 0xc120e38e  // fcvt z14.h, { z28.s-z29.s }\n"
        ".inst 0xc120e3cf  // fcvt z15.h, { z30.s-z31.s }\n"
        ".inst 0xa06225ce  // st1h { z14.h-z15.h }, pn9.b, [x14, #0x4, MUL VL]\n"
        ".inst 0xc120e112  // fcvt z18.h, { z8.s-z9.s }\n"
        ".inst 0xc120e15a  // fcvt z26.h, { z10.s-z11.s }\n"
        ".inst 0xa16321d2  // st1h { z18.h, z26.h }, p8, [x14, #0x6, MUL VL]\n"
        "addvl x14, x14, #8\n"
        "27:"  // Width 4: Output done
        "subs x13, x13, #0x4\n"
        "mov x16, x22\n"
        "sub %x[N], %x[N], x15, LSL #2\n"
        "bgt 4b\n"
        "28:"  // Exit
        ".inst 0xd503467f  // SMSTOP\n"
        : [N] "+&r"(N)
        : [A_ptr] "r"(A_ptr), [B_ptr] "r"(B_ptr), [K] "r"(K), [args_ptr] "r"(&ka), [flags] "r"(flags),
          [offset_max] "I"(offsetof(KernelArgs, maxval)), [offset_min] "I"(offsetof(KernelArgs, minval)),
          [output_ptr] "r"(output_ptr)
        : "cc", "memory", "p0", "p1", "p10", "p11", "p12", "p13", "p14", "p15", "p2", "p3", "p4", "p5", "p6", "p7",
          "p8", "p9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26",
          "x27", "x28", "x8", "x9", "z0", "z1", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19",
          "z2", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z3", "z30", "z31", "z4", "z5",
          "z6", "z7", "z8", "z9");
}

#endif  // Architectural features check.
